library(data.table)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between() masks data.table::between()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks data.table::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks data.table::last()
## ✖ purrr::transpose() masks data.table::transpose()
library(dplyr)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(knitr)
library(widgetframe)
## Loading required package: htmlwidgets
opts_chunk$set(
warning = FALSE,
message = FALSE,
eval=TRUE,
echo = TRUE,
cache=FALSE,
include=TRUE,
fig.width = 7,
fig.align = 'center',
fig.asp = 0.618,
out.width = "700px")
Read in and process the COVID dataset from the New York Times GitHub repository Create interactive graphs of different types using plot_ly() and ggplotly() functions Customize the hoverinfo and other plot features Create a Choropleth map using plot_geo()
We will work with COVID data downloaded from the New York Times. The dataset consists of COVID-19 cases and deaths in each US state during the course of the COVID epidemic.
**The objective of this lab is to explore relationships between cases, deaths, and population sizes of US states, and plot data to demonstrate this
## data extracted from New York Times state-level data from NYT Github repository
# https://github.com/nytimes/covid-19-data
## state-level population information from us_census_data available on GitHub repository:
# https://github.com/COVID19Tracking/associated-data/tree/master/us_census_data
### FINISH THE CODE HERE ###
# load COVID state-level data from NYT
cv_states <- as.data.frame(data.table::fread("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"))
### FINISH THE CODE HERE ###
# load state population data
state_pops <- as.data.frame(data.table::fread("https://raw.githubusercontent.com/COVID19Tracking/associated-data/master/us_census_data/us_census_2018_population_estimates_states.csv"))
state_pops$abb <- state_pops$state
state_pops$state <- state_pops$state_name
state_pops$state_name <- NULL
### FINISH THE CODE HERE
cv_states <- merge(cv_states, state_pops, by="state")
head, and tail of
the datadim(cv_states)
## [1] 51282 9
head(cv_states)
## state date fips cases deaths geo_id population pop_density abb
## 1 Alabama 2020-10-06 1 160477 2580 1 4887871 96.50939 AL
## 2 Alabama 2021-08-02 1 589110 11536 1 4887871 96.50939 AL
## 3 Alabama 2021-04-13 1 520503 10722 1 4887871 96.50939 AL
## 4 Alabama 2021-06-08 1 546540 11220 1 4887871 96.50939 AL
## 5 Alabama 2021-06-10 1 547135 11252 1 4887871 96.50939 AL
## 6 Alabama 2022-08-30 1 1479605 20048 1 4887871 96.50939 AL
tail(cv_states)
## state date fips cases deaths geo_id population pop_density abb
## 51277 Wyoming 2022-01-28 56 144526 1625 56 577737 5.950611 WY
## 51278 Wyoming 2021-10-19 56 98567 1136 56 577737 5.950611 WY
## 51279 Wyoming 2022-02-17 56 153850 1689 56 577737 5.950611 WY
## 51280 Wyoming 2022-09-13 56 175746 1888 56 577737 5.950611 WY
## 51281 Wyoming 2021-12-31 56 115638 1526 56 577737 5.950611 WY
## 51282 Wyoming 2020-04-27 56 520 7 56 577737 5.950611 WY
str(cv_states)
## 'data.frame': 51282 obs. of 9 variables:
## $ state : chr "Alabama" "Alabama" "Alabama" "Alabama" ...
## $ date : IDate, format: "2020-10-06" "2021-08-02" ...
## $ fips : int 1 1 1 1 1 1 1 1 1 1 ...
## $ cases : int 160477 589110 520503 546540 547135 1479605 551298 1528739 1153149 1531305 ...
## $ deaths : int 2580 11536 10722 11220 11252 20048 11358 20505 16826 20533 ...
## $ geo_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ population : int 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
## $ pop_density: num 96.5 96.5 96.5 96.5 96.5 ...
## $ abb : chr "AL" "AL" "AL" "AL" ...
# format the date
cv_states$date <- as.Date(cv_states$date, format="%Y-%m-%d")
# format the state and state abbreviation (abb) variables
state_list <- unique(cv_states$state)
cv_states$state <- factor(cv_states$state, levels = state_list)
abb_list <- unique(cv_states$abb)
cv_states$abb <- factor(cv_states$abb, levels = abb_list)
### FINISH THE CODE HERE
# order the data first by state, second by date
cv_states = cv_states[order(cv_states$state, cv_states$date),]
# Confirm the variables are now correctly formatted
str(cv_states)
## 'data.frame': 51282 obs. of 9 variables:
## $ state : Factor w/ 52 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ date : Date, format: "2020-03-13" "2020-03-14" ...
## $ fips : int 1 1 1 1 1 1 1 1 1 1 ...
## $ cases : int 6 12 23 29 39 51 78 106 131 157 ...
## $ deaths : int 0 0 0 0 0 0 0 0 0 0 ...
## $ geo_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ population : int 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
## $ pop_density: num 96.5 96.5 96.5 96.5 96.5 ...
## $ abb : Factor w/ 52 levels "AL","AK","AZ",..: 1 1 1 1 1 1 1 1 1 1 ...
head(cv_states)
## state date fips cases deaths geo_id population pop_density abb
## 788 Alabama 2020-03-13 1 6 0 1 4887871 96.50939 AL
## 593 Alabama 2020-03-14 1 12 0 1 4887871 96.50939 AL
## 768 Alabama 2020-03-15 1 23 0 1 4887871 96.50939 AL
## 334 Alabama 2020-03-16 1 29 0 1 4887871 96.50939 AL
## 591 Alabama 2020-03-17 1 39 0 1 4887871 96.50939 AL
## 440 Alabama 2020-03-18 1 51 0 1 4887871 96.50939 AL
tail(cv_states)
## state date fips cases deaths geo_id population pop_density abb
## 50536 Wyoming 2022-11-07 56 178866 1914 56 577737 5.950611 WY
## 51197 Wyoming 2022-11-08 56 179366 1917 56 577737 5.950611 WY
## 51249 Wyoming 2022-11-09 56 179366 1917 56 577737 5.950611 WY
## 50945 Wyoming 2022-11-10 56 179366 1917 56 577737 5.950611 WY
## 50446 Wyoming 2022-11-11 56 179366 1917 56 577737 5.950611 WY
## 50817 Wyoming 2022-11-12 56 179366 1917 56 577737 5.950611 WY
# Inspect the range values for each variable. What is the date range? The range of cases and deaths?
head(cv_states)
## state date fips cases deaths geo_id population pop_density abb
## 788 Alabama 2020-03-13 1 6 0 1 4887871 96.50939 AL
## 593 Alabama 2020-03-14 1 12 0 1 4887871 96.50939 AL
## 768 Alabama 2020-03-15 1 23 0 1 4887871 96.50939 AL
## 334 Alabama 2020-03-16 1 29 0 1 4887871 96.50939 AL
## 591 Alabama 2020-03-17 1 39 0 1 4887871 96.50939 AL
## 440 Alabama 2020-03-18 1 51 0 1 4887871 96.50939 AL
summary(cv_states)
## state date fips cases
## Washington : 1027 Min. :2020-01-21 Min. : 1.00 Min. : 1
## Illinois : 1024 1st Qu.:2020-11-03 1st Qu.:16.00 1st Qu.: 89622
## California : 1023 Median :2021-07-07 Median :29.00 Median : 343328
## Arizona : 1022 Mean :2021-07-07 Mean :29.78 Mean : 815731
## Massachusetts: 1016 3rd Qu.:2022-03-11 3rd Qu.:44.00 3rd Qu.: 960676
## Wisconsin : 1012 Max. :2022-11-12 Max. :72.00 Max. :11411720
## (Other) :45158
## deaths geo_id population pop_density
## Min. : 0 Min. : 1.00 Min. : 577737 Min. : 1.292
## 1st Qu.: 1369 1st Qu.:16.00 1st Qu.: 1805832 1st Qu.: 43.659
## Median : 5119 Median :29.00 Median : 4468402 Median : 107.860
## Mean :11390 Mean :29.78 Mean : 6403921 Mean : 422.945
## 3rd Qu.:14412 3rd Qu.:44.00 3rd Qu.: 7535591 3rd Qu.: 229.511
## Max. :97142 Max. :72.00 Max. :39557045 Max. :11490.120
## NA's :975
## abb
## WA : 1027
## IL : 1024
## CA : 1023
## AZ : 1022
## MA : 1016
## WI : 1012
## (Other):45158
min(cv_states$date)
## [1] "2020-01-21"
max(cv_states$date)
## [1] "2022-11-12"
new_cases and new_deaths and
correct outliersnew_cases, and new deaths,
new_deaths:
new_cases equal to the difference
between cases on date i and date i-1, starting on date i=2plotly for EDA: See if there are outliers or values
that don’t make sense for new_cases and
new_deaths. Which states and which dates have strange
values?new_cases or
new_deaths to 0cases and deaths as cumulative
sum of updated new_cases and new_deaths# Add variables for new_cases and new_deaths:
for (i in 1:length(state_list)) {
cv_subset = subset(cv_states, state == state_list[i])
cv_subset = cv_subset[order(cv_subset$date),]
# add starting level for new cases and deaths
cv_subset$new_cases = cv_subset$cases[1]
cv_subset$new_deaths = cv_subset$deaths[1]
### FINISH THE CODE HERE
for (j in 2:nrow(cv_subset)) {
cv_subset$new_cases[j] = cv_subset$cases[j] - cv_subset$cases[j-1]
cv_subset$new_deaths[j] = cv_subset$deaths[j] - cv_subset$deaths[j-1]
}
# include in main dataset
cv_states$new_cases[cv_states$state==state_list[i]] = cv_subset$new_cases
cv_states$new_deaths[cv_states$state==state_list[i]] = cv_subset$new_deaths
}
# Focus on recent dates
cv_states <- cv_states %>% dplyr::filter(date >= "2022-06-01")
### FINISH THE CODE HERE
# Inspect outliers in new_cases using plotly
p1<-ggplot(cv_states, aes(x = date, y = new_cases, color = state)) + geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p1)
p1<-NULL # to clear from workspace
p2<-ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) + geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p2)
p2<-NULL # to clear from workspace
# set negative new case or death counts to 0
cv_states$new_cases[cv_states$new_cases<0] = 0
cv_states$new_deaths[cv_states$new_deaths<0] = 0
# Recalculate `cases` and `deaths` as cumulative sum of updated `new_cases` and `new_deaths`
for (i in 1:length(state_list)) {
cv_subset = subset(cv_states, state == state_list[i])
# add starting level for new cases and deaths
cv_subset$cases = cv_subset$cases[1]
cv_subset$deaths = cv_subset$deaths[1]
### FINISH CODE HERE
for (j in 2:nrow(cv_subset)) {
cv_subset$cases[j] = cv_subset$new_cases[j] + cv_subset$cases[j-1]
cv_subset$deaths[j] = cv_subset$new_deaths[j] + cv_subset$deaths[j-1]
}
# include in main dataset
cv_states$cases[cv_states$state==state_list[i]] = cv_subset$cases
cv_states$deaths[cv_states$state==state_list[i]] = cv_subset$deaths
}
# Smooth new counts
cv_states$new_cases = zoo::rollmean(cv_states$new_cases, k=7, fill=NA, align='right') %>% round(digits = 0)
cv_states$new_deaths = zoo::rollmean(cv_states$new_deaths, k=7, fill=NA, align='right') %>% round(digits = 0)
# Inspect data again interactively
p2<-ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) + geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p2)
p2=NULL
Add population-normalized (by 100,000) variables for each
variable type (rounded to 1 decimal place). Make sure the variables you
calculate are in the correct format (numeric). You can use
the following variable names:
per100k = cases per 100,000 populationnewper100k= new cases per 100,000deathsper100k = deaths per 100,000newdeathsper100k = new deaths per 100,000Add a “naive CFR” variable representing
deaths / cases on each date for each state
Create a dataframe representing values on the most recent date,
cv_states_today, as done in lecture
### FINISH CODE HERE
# add population normalized (by 100,000) counts for each variable
cv_states$per100k = as.numeric(format(round(cv_states$cases/(cv_states$population/100000),1),nsmall=1))
cv_states$newper100k = as.numeric(format(round(cv_states$new_cases/(cv_states$population/100000),1),nsmall=1))
cv_states$deathsper100k = as.numeric(format(round(cv_states$deaths/(cv_states$population/100000),1),nsmall=1))
cv_states$newdeathsper100k = as.numeric(format(round(cv_states$new_deaths/(cv_states$population/100000),1),nsmall=1))
# add a naive_CFR variable = deaths / cases
cv_states = cv_states %>% mutate(naive_CFR = round((deaths*100/cases),2))
# create a `cv_states_today` variable
cv_states_today = subset(cv_states, date==max(cv_states$date))
plot_ly()plot_ly() representing
pop_density vs. various variables (e.g. cases,
per100k, deaths, deathsper100k)
for each state on most recent date (cv_states_today)
hovermode = "compare"### FINISH CODE HERE
# pop_density vs. cases
cv_states_today %>%
plot_ly(x = ~pop_density, y = ~cases,
type = 'scatter', mode = 'markers', color = ~state,
size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
# filter out "District of Columbia"
cv_states_today_filter <- cv_states_today %>% filter(state!="District of Columbia")
# pop_density vs. cases after filtering
cv_states_today_filter %>%
plot_ly(x = ~pop_density, y = ~cases,
type = 'scatter', mode = 'markers', color = ~state,
size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
# pop_density vs. deathsper100k
cv_states_today_filter %>%
plot_ly(x = ~pop_density, y = ~deathsper100k,
type = 'scatter', mode = 'markers', color = ~state,
size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
# Adding hoverinfo
cv_states_today_filter %>%
plot_ly(x = ~pop_density, y = ~deathsper100k,
type = 'scatter', mode = 'markers', color = ~state,
size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5),
hoverinfo = 'text',
text = ~paste( paste(state, ":", sep=""),
paste(" Cases per 100k: ", per100k, sep="") ,
paste(" Deaths per 100k: ", deathsper100k, sep=""), sep = "<br>")) %>%
layout(title = "Population-normalized COVID-19 deaths (per 100k) vs. population density for US states",
yaxis = list(title = "Deaths per 100k"),
xaxis = list(title = "Population Density"),
hovermode = "compare")
ggplotly() and geom_smooth()pop_density vs. newdeathsper100k
create a chart with the same variables using
gglot_ly()geom_smooth()
pop_density is a
correlate of newdeathsper100k?### FINISH CODE HERE
p <- ggplot(cv_states_today_filter, aes(x=pop_density, y=deathsper100k, size=population)) + geom_point() + geom_smooth()
ggplotly(p)
naive_CFR for all states
over time using plot_ly()
naive_CFR for
the states that had an increase in September. How have they changed over
time?new_cases and new_deaths together in one plot.
Hint: use add_layer()
### FINISH CODE HERE
# Line chart for naive_CFR for all states over time using `plot_ly()`
plot_ly(cv_states, x = ~date, y = ~naive_CFR, color = ~state, type = "scatter", mode = "lines")
### FINISH CODE HERE
# Line chart for Florida showing new_cases and new_deaths together
cv_states %>% filter(state=="Florida") %>% plot_ly(x = ~date, y = ~new_cases, type = "scatter", mode = "lines") %>%
add_lines(x = ~date, y = ~new_deaths, type = "scatter", mode = "lines")
Create a heatmap to visualize new_cases for each state
on each date greater than June 1st, 2021 - Start by mapping selected
features in the dataframe into a matrix using the tidyr
package function pivot_wider(), naming the rows and
columns, as done in the lecture notes - Use plot_ly() to
create a heatmap out of this matrix. Which states stand out? - Repeat
with newper100k variable. Now which states stand out? -
Create a second heatmap in which the pattern of new_cases
for each state over time becomes more clear by filtering to only look at
dates every two weeks
### FINISH CODE HERE
# Map state, date, and new_cases to a matrix
library(tidyr)
cv_states_mat <- cv_states %>% select(state, date, new_cases) %>% dplyr::filter(date>as.Date("2022-06-15"))
cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = new_cases))
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)
# Create a heatmap using plot_ly()
plot_ly(x=colnames(cv_states_mat2), y=rownames(cv_states_mat2),
z=~cv_states_mat2,
type="heatmap",
showscale=T)
# Repeat with newper100k
cv_states_mat <- cv_states %>% select(state, date, newper100k) %>% dplyr::filter(date>as.Date("2022-06-15"))
cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = newper100k))
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)
plot_ly(x=colnames(cv_states_mat2), y=rownames(cv_states_mat2),
z=~cv_states_mat2,
type="heatmap",
showscale=T)
# Create a second heatmap after filtering to only include dates every other week
filter_dates <- seq(as.Date("2022-06-15"), as.Date("2022-11-01"), by="2 weeks")
cv_states_mat <- cv_states %>% select(state, date, newper100k) %>% filter(date %in% filter_dates)
cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = newper100k))
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)
# Create a heatmap using plot_ly()
plot_ly(x=colnames(cv_states_mat2), y=rownames(cv_states_mat2),
z=~cv_states_mat2,
type="heatmap",
showscale=T)
naive_CFR by state on
October 15, 2021naive_CFR by state
on most recent datesubplot(). Make sure
the shading is for the same range of values (google is your friend for
this)### For specified date
pick.date = "2022-10-15"
# Extract the data for each state by its abbreviation
cv_per100 <- cv_states %>%
filter(date==pick.date) %>%
select(state, abb, newper100k, cases, deaths) # select data
cv_per100$state_name <- cv_per100$state
cv_per100$state <- cv_per100$abb
cv_per100$abb <- NULL
# Create hover text
cv_per100$hover <- with(cv_per100,
paste(state_name, '<br>',
"Cases per 100k: ", newper100k, '<br>',
"Cases: ", cases, '<br>',
"Deaths: ", deaths))
# Set up mapping details
set_map_details <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showlakes = TRUE,
lakecolor = toRGB('white')
)
# Make sure both maps are on the same color scale
shadeLimit <- 35
# Create the map
fig <- plot_geo(cv_per100, locationmode = 'USA-states') %>%
add_trace(
z = ~newper100k,
text = ~hover,
locations = ~state,
color = ~newper100k,
colors = 'Purples'
)
fig <- fig %>%
colorbar(title = "Cases per 100k", limits = c(0,shadeLimit))
fig <- fig %>%
layout(
geo = set_map_details
)
fig_pick.date <- fig
#############
### Map for today's date
# Extract the data for each state by its abbreviation
cv_per100 <- cv_states_today %>%
select(state, abb, newper100k, cases, deaths) # select data
cv_per100$state_name <- cv_per100$state
cv_per100$state <- cv_per100$abb
cv_per100$abb <- NULL
# Create hover text
cv_per100$hover <- with(cv_per100,
paste(state_name, '<br>',
"Cases per 100k: ", newper100k, '<br>',
"Cases: ", cases, '<br>',
"Deaths: ", deaths))
# Create the map
fig <- plot_geo(cv_per100, locationmode = 'USA-states') %>%
add_trace(
z = ~newper100k,
text = ~hover,
locations = ~state,
color = ~newper100k,
colors = 'Purples'
)
fig <- fig %>%
colorbar(title = "Cases per 100k", limits = c(0,shadeLimit))
fig <- fig %>%
layout(
geo = set_map_details
)
fig_Today <- fig
### Plot together
finalfig <- subplot(fig_pick.date, fig_Today, nrows = 2) %>%
layout(showlegend = FALSE,
title = paste('Cases per 100k by State',
'<br>(Hover for value)'),
hovermode = TRUE
) %>%
colorbar(title = "Cases per 100k", limits = c(0,shadeLimit))
annotations = list(
list(
x = 0.5,
y = 0.5,
text = pick.date,
xref = "paper",
yref = "paper",
xanchor = "center",
yanchor = "bottom",
showarrow = FALSE
),
list(
x = 0.5,
y = -0.05,
text = Sys.Date(),
xref = "paper",
yref = "paper",
xanchor = "center",
yanchor = "bottom",
showarrow = FALSE
))
finalfig <- finalfig %>%layout(annotations = annotations)
finalfig
Ideally, we would not repeat the colorbar twice.